And some package and labeling info
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(kohonen)
library(vegan) # for vegdist function which gives a dissimilarity index
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
timeseries_dir <- 'extracted_timeseries/'
# get ecs ordering/labels
esm_labels <- read.csv(paste0(timeseries_dir,'global_tas_allesms.csv'), stringsAsFactors = FALSE) %>%
select(esm) %>% distinct %>%
mutate(plotesm = paste0(letters[as.integer(row.names(.))], '.', esm),
ECS_order = as.integer(row.names(.)))
print(esm_labels)
## esm plotesm ECS_order
## 1 CAMS-CSM1-0 a.CAMS-CSM1-0 1
## 2 MIROC6 b.MIROC6 2
## 3 GFDL-ESM4 c.GFDL-ESM4 3
## 4 FGOALS-g3 d.FGOALS-g3 4
## 5 MPI-ESM1-2-HR e.MPI-ESM1-2-HR 5
## 6 MPI-ESM1-2-LR f.MPI-ESM1-2-LR 6
## 7 MRI-ESM2-0 g.MRI-ESM2-0 7
## 8 ACCESS-ESM1-5 h.ACCESS-ESM1-5 8
## 9 IPSL-CM6A-LR i.IPSL-CM6A-LR 9
## 10 CESM2-WACCM j.CESM2-WACCM 10
## 11 UKESM1-0-LL k.UKESM1-0-LL 11
## 12 CanESM5 l.CanESM5 12
process_time_series <- function(time_series_df, esm_label_info,
hist_start = 1995, hist_end = 2014){
# Get ensemble member values for projection runs:
# 1. 2080-2099 average
# 2. loess detrend each ensemble member to get IAV
#
# split by run
non_hist <- time_series_df %>%
filter(experiment != 'historical') %>%
select(year, ann_agg, esm, experiment, ensemble, variable, region)
grouped <- split(non_hist, f = list(non_hist$esm,
non_hist$experiment,
non_hist$ensemble,
non_hist$variable ,
non_hist$region) )
# split creates group of every possible combo of the variables and fills in
# empty dataframes for the ones that don't exist in data. Drop those
grouped <- grouped[lapply(grouped, nrow)>0]
processed_groups <- lapply(grouped, FUN = function(run_df){
loess_resids <- loess(run_df$ann_agg ~ run_df$year)$residuals
run_df %>%
filter(year >= 2080, year <= 2099) %>%
group_by(esm, experiment, ensemble, variable, region) %>%
summarise(average_2080_2099 = mean(ann_agg)) %>%
ungroup %>%
mutate(iasd = sd((loess_resids))) ->
output_df
return(output_df)
})
individual_stats <- do.call(bind_rows, processed_groups)
rm(non_hist)
rm(grouped)
rm(processed_groups)
# calculate ensemble averages
individual_stats %>%
group_by(esm, experiment, variable,region) %>%
summarise(average_2080_2099 = mean(average_2080_2099),
iasd = mean(iasd)) %>%
ungroup ->
ensemble_stats
# get ensemble average historical average value:
time_series_df %>%
filter(experiment == 'historical',
year >= hist_start,
year <= hist_end) %>%
group_by(esm, experiment, ensemble, variable, region) %>%
summarise(historical_average = mean(ann_agg)) %>%
ungroup %>%
group_by(esm, experiment, variable, region) %>%
summarise(historical_average = mean(historical_average)) %>%
ungroup %>%
select(-experiment) ->
historical_ens
# shape and calculate changes for plotting:
ensemble_stats %>%
left_join(historical_ens, by = c('esm', 'variable', 'region')) %>%
left_join(esm_label_info, by = 'esm') %>%
mutate(change = average_2080_2099 - historical_average,
pct_change = 100*(average_2080_2099 - historical_average)/historical_average) ->
plot_tbl
return(plot_tbl)
}
process_ens_time_series <- function(time_series_df, esm_label_info,
hist_start = 1995, hist_end = 2014){
# Get ensemble member values for projection runs:
# 1. 2080-2099 average
# 2. loess detrend each ensemble member to get IAV
#
# split by run
non_hist <- time_series_df %>%
filter(experiment != 'historical') %>%
select(year, ann_agg, esm, experiment, ensemble, variable, region)
grouped <- split(non_hist, f = list(non_hist$esm,
non_hist$experiment,
non_hist$ensemble,
non_hist$variable ,
non_hist$region) )
# split creates group of every possible combo of the variables and fills in
# empty dataframes for the ones that don't exist in data. Drop those
grouped <- grouped[lapply(grouped, nrow)>0]
processed_groups <- lapply(grouped, FUN = function(run_df){
loess_resids <- loess(run_df$ann_agg ~ run_df$year)$residuals
run_df %>%
filter(year >= 2080, year <= 2099) %>%
group_by(esm, experiment, ensemble, variable, region) %>%
summarise(average_2080_2099 = mean(ann_agg)) %>%
ungroup %>%
mutate(iasd = sd((loess_resids))) ->
output_df
return(output_df)
})
individual_stats <- do.call(bind_rows, processed_groups)
rm(non_hist)
rm(grouped)
rm(processed_groups)
# calculate ensemble averages
individual_stats ->
ensemble_stats
# get ensemble average historical average value:
time_series_df %>%
filter(experiment == 'historical',
year >= hist_start,
year <= hist_end) %>%
group_by(esm, experiment, ensemble, variable, region) %>%
summarise(historical_average = mean(ann_agg)) %>%
ungroup %>%
group_by(esm, experiment, variable, region) %>%
summarise(historical_average = mean(historical_average)) %>%
ungroup %>%
select(-experiment) ->
historical_ens
# shape and calculate changes for plotting:
ensemble_stats %>%
left_join(historical_ens, by = c('esm', 'variable', 'region')) %>%
left_join(esm_label_info, by = 'esm') %>%
mutate(change = average_2080_2099 - historical_average,
pct_change = 100*(average_2080_2099 - historical_average)/historical_average) ->
plot_tbl
return(plot_tbl)
}
prep_esm_TP_data <- function(esmname){
region_timeseries <- read.csv(paste0(timeseries_dir, 'IPCC_land_regions_tas_', esmname, '_timeseries_1980_2099.csv'),
stringsAsFactors = FALSE) %>% mutate(region = acronym)
region_tas_summary <- suppressMessages(process_time_series(time_series_df = region_timeseries, esm_label_info = esm_labels))
region_pr_summary <- suppressMessages(process_time_series(time_series_df =
read.csv(paste0(timeseries_dir, 'IPCC_land_regions_pr_', esmname,'_timeseries_1980_2099.csv'),
stringsAsFactors = FALSE) %>%
rename(ann_agg=pr) %>%
mutate(region = acronym) ,
esm_label_info = esm_labels))
# reshape so each row is an observation
# observation = esm - experiment - region - tas change-tas iasd - pr pct change
region_tas_summary %>%
select(esm, experiment, region, iasd, change) %>%
rename(tas_change = change) %>%
left_join(region_pr_summary %>%
select(esm, experiment, region, pct_change) %>%
rename(pr_pct = pct_change),
by = c('esm', 'experiment', 'region')) ->
region_summary
return(region_summary)
}
Consider an observation = esm - experiment - region : tas change-tas iasd - pr pct change
# region_summary <- prep_esm_TP_data(esmname = 'GFDL-ESM4')
# region_summary %>%
# bind_rows(prep_esm_TP_data(esmname = 'CESM2-WACCM')) %>%
# bind_rows(prep_esm_TP_data(esmname = 'CAMS-CSM1-0'))->
# region_summary_main
# write.csv(region_summary_main, 'CAMS_GFDL_CESM2-WACCM_summaries.csv', row.names = F)
region_summary_main <- read.csv('CAMS_GFDL_CESM2-WACCM_summaries.csv', stringsAsFactors = FALSE)
# make a copy to operate on
region_summary <- region_summary_main
print(head(region_summary))
## esm experiment region iasd tas_change pr_pct
## 1 GFDL-ESM4 ssp119 ARP 0.2652738 0.2610944 2.9927902
## 2 GFDL-ESM4 ssp119 CAF 0.2703431 0.5037959 -1.9838156
## 3 GFDL-ESM4 ssp119 CAR 0.1574689 0.2216722 -0.3187443
## 4 GFDL-ESM4 ssp119 CAU 0.5345113 0.4218907 -3.8398722
## 5 GFDL-ESM4 ssp119 CNA 0.4910504 0.9446315 6.3459839
## 6 GFDL-ESM4 ssp119 EAS 0.2584605 0.7250376 10.5204045
default SOM packages can only operate on numerical data, not categorical. So we have to assign some amount of spatial location numerical info to each acronym. Ideally mean lat-lon in the shape?
Also we want the shapes for plotting anyway, so prep them
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
shp <- st_read(dsn = 'IPCC-WGI-reference-regions-v4_shapefile/IPCC-WGI-reference-regions-v4.shp', stringsAsFactors = F)
## Reading layer `IPCC-WGI-reference-regions-v4' from data source
## `/Users/snyd535/Documents/GitHub/stitches_in_r/R/inst/shinyApp/python_curation/IPCC-WGI-reference-regions-v4_shapefile/IPCC-WGI-reference-regions-v4.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -90 xmax: 180 ymax: 90
## Geodetic CRS: WGS 84
# add a numerical region id
shp %>%
mutate(region_id = as.integer(row.names(.))) ->
shp
# add coordinate info probably
shp1 <- st_transform(shp, "+proj=longlat +ellps=WGS84 +datum=WGS84")
# extract
coords <- as.data.frame(st_coordinates(shp1))
# get a mean lon and lat value in each shape
coords %>%
rename(lon = X, lat = Y, region_id = L3) %>%
left_join(as.data.frame(shp) %>% select(region_id, Acronym), by = 'region_id') %>%
filter(grepl('PO', Acronym)) %>%
# have to have lon on 0:360 so th pacific ocean behaves even though not
# looking at that here
mutate(lon_360 = if_else(lon >=0, lon, lon+360))%>%
group_by(region_id) %>%
summarise(mean_lon = mean(lon_360),
mean_lat = mean(lat)) %>%
ungroup %>%
mutate(mean_lon = if_else(mean_lon >= 0 & mean_lon <= 180,
mean_lon, mean_lon - 360) ) ->
mean_pts_PO
coords %>%
rename(lon = X, lat = Y, region_id = L3) %>%
left_join(as.data.frame(shp) %>% select(region_id, Acronym), by = 'region_id') %>%
filter(!grepl('PO', Acronym)) %>%
# have to have lon on 0:360 so th pacific ocean behaves even though not
# looking at that here
group_by(region_id) %>%
summarise(mean_lon = mean(lon),
mean_lat = mean(lat)) %>%
ungroup %>%
bind_rows(mean_pts_PO)->
mean_pts
# Join to the shape file and make sure this very simple way of
# doing things ends up with a lon lat that is actually in each region
shp %>%
left_join(mean_pts, by = 'region_id') ->
shp
ggplot() +
geom_sf(data = shp ) +
geom_point(data = shp, mapping = aes(x = mean_lon, y = mean_lat), color = 'red') +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =5)
we have 3 variables per observarion = experimentXregion -> convert to rgb values
looking across ESMs, we’ll want to make sure we have them all on consistent range before converting to (0,255)
each color/family of colors does have a physical interpretation in terms of iasd, t, p
red = temperature
blue = precip
iasd = green
# convert to RGB
region_summary %>%
filter(experiment != 'ssp119') ->
region_summary
region_summary$r <- scales::rescale(region_summary$tas_change, to =c(0,255))
region_summary$g <- scales::rescale(region_summary$iasd, to =c(0,255))
region_summary$b <- scales::rescale(region_summary$pr_pct, to =c(0,255))
# add spatial numerical info
region_summary %>%
left_join(as.data.frame(shp) %>% select(Acronym, mean_lon, mean_lat), by = c('region' = 'Acronym')) %>%
rename(lon = mean_lon, lat = mean_lat) %>%
# add the original colors
mutate(orig_color = rgb(.$r, .$g, .$b, maxColorValue = 255)) ->
region_summary
region_numerical <- as.data.frame(region_summary[c('lon', 'lat', 'r', "g", "b")])
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
fig <- plot_ly(region_summary, x = ~tas_change, y = ~iasd, z = ~pr_pct, color = ~orig_color,
colors = region_summary$orig_color)
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig <- plot_ly(region_summary %>% filter(experiment == 'ssp126'), x = ~tas_change, y = ~iasd, z = ~pr_pct, color = ~orig_color,
colors = (region_summary %>% filter(experiment == 'ssp126'))$orig_color)
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig <- plot_ly(region_summary %>% filter(experiment == 'ssp245'), x = ~tas_change, y = ~iasd, z = ~pr_pct, color = ~orig_color,
colors = (region_summary %>% filter(experiment == 'ssp245'))$orig_color)
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig <- plot_ly(region_summary %>% filter(experiment == 'ssp370'), x = ~tas_change, y = ~iasd, z = ~pr_pct, color = ~orig_color,
colors = (region_summary %>% filter(experiment == 'ssp370'))$orig_color)
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig <- plot_ly(region_summary %>% filter(experiment == 'ssp585'), x = ~tas_change, y = ~iasd, z = ~pr_pct, color = ~orig_color,
colors = (region_summary %>% filter(experiment == 'ssp585'))$orig_color)
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
set.seed(11)
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 516"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events <- som.model$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors <- rgb( som.events[,3],
som.events[,4],
som.events[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist <- as.matrix(dist(som.events))
print(head(som.events))
## lon lat r g b
## V1 99.5254 59.167376 163.12832 141.3639 196.3608
## V2 133.8064 55.852860 110.43912 125.9292 168.7815
## V3 150.7777 54.981011 61.90710 112.5471 144.7106
## V4 143.0164 -7.773551 53.72205 122.8078 136.4233
## V5 138.3231 -26.048733 55.77659 137.2779 135.6411
## V6 123.2724 -1.249687 80.17123 148.4979 153.1180
p1<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors) +
theme(legend.position = 'none') +
ggtitle('CAMS, GFDL and CESM2-WACCM')
p1
The default plotting starts bottom left and populates the SOMs in the
order they came out/are in in events
plot(som.model,
type = 'mapping',
bg = som.events.colors,
keepMargins = F,
col = NA,
main = '')
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 516"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model1 <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events1 <- som.model1$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors1 <- rgb( som.events1[,3],
som.events1[,4],
som.events1[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist1 <- as.matrix(dist(som.events1))
p2<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events1), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors1) +
theme(legend.position = 'none')+
ggtitle('CAMS, GFDL and CESM2-WACCM')
p1
p2
ggsave(paste0('python_curation_figs/CAMS_GFDL_CESM2-WACCM_som1_grid', grid.size, '.png'), p1, width = 8, height = 6, units = 'in')
ggsave(paste0('python_curation_figs/CAMS_GFDL_CESM2-WACCM_som2_grid', grid.size, '.png'), p2, width = 8, height = 6, units = 'in')
plot(som.model,
type = 'mapping',
bg = som.events.colors,
keepMargins = F,
col = NA,
main = '')
plot(som.model1,
type = 'mapping',
bg = som.events.colors1,
keepMargins = F,
col = NA,
main = '')
Let’s label each observation with its SOM classification
region_summary$model1_unitclass <- som.model$unit.classif
region_summary$model2_unitclass <- som.model1$unit.classif
region_summary %>%
left_join(data.frame(model1_unitclass = 1:169) %>%
mutate(color1 = som.events.colors),
by='model1_unitclass') %>%
left_join(data.frame(model2_unitclass = 1:169) %>%
mutate(color2 = som.events.colors1),
by='model2_unitclass')->
region_data
print(region_data %>% filter(model1_unitclass == 17))
## [1] esm experiment region iasd
## [5] tas_change pr_pct r g
## [9] b lon lat orig_color
## [13] model1_unitclass model2_unitclass color1 color2
## <0 rows> (or 0-length row.names)
and the original color:
region_data %>%
mutate(orig_color = rgb(.$r, .$g, .$b, maxColorValue = 255)) ->
tmp
ggplot(tmp) + geom_tile(mapping = aes( x = interaction(experiment, esm), y = region, fill = orig_color) ) +
scale_fill_manual(values = tmp$orig_color) +
theme(legend.position='none',
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle('CAMS, GFDL, CESM2-WACCM original colors')
ggplot(tmp) + geom_tile(mapping = aes( x = interaction(experiment, esm), y = region, fill = color1) ) +
scale_fill_manual(values = tmp$color1) +
theme(legend.position='none',
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle('CAMS, GFDL, CESM2-WACCM SOM1 colors')
ggplot(tmp) + geom_tile(mapping = aes( x = interaction(experiment, esm), y = region, fill = color2) ) +
scale_fill_manual(values = tmp$color2) +
theme(legend.position='none',
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle('CAMS, GFDL, CESM2-WACCM SOM2 colors')
Start with SSP126 low temperature changes so should be mostly blues
region_summary %>%
filter(experiment == 'ssp126') ->
region_summary_126
region_numerical <- as.data.frame(region_summary_126[c('lon', 'lat', 'r', "g", "b")])
set.seed(11)
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 129"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events <- som.model$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors <- rgb( som.events[,3],
som.events[,4],
som.events[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist <- as.matrix(dist(som.events))
p1<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors) +
theme(legend.position = 'none') +
ggtitle('CAMS, GFDL and CESM2-WACCM SSP126')
p1
plot(som.model,
type = 'mapping',
bg = som.events.colors,
keepMargins = F,
col = NA,
main = '')
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 129"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model1 <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events1 <- som.model1$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors1 <- rgb( som.events1[,3],
som.events1[,4],
som.events1[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist1 <- as.matrix(dist(som.events1))
p2<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events1), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors1) +
theme(legend.position = 'none')+
ggtitle('CAMS, GFDL and CESM2-WACCM SSP126')
p1
p2
ggsave(paste0('python_curation_figs/CAMS_GFDL_CESM2-WACCM_som1_grid', grid.size, '_ssp126.png'), p1, width = 8, height = 6, units = 'in')
ggsave(paste0('python_curation_figs/CAMS_GFDL_CESM2-WACCM_som2_grid', grid.size, '_ssp126.png'), p2, width = 8, height = 6, units = 'in')
plot(som.model,
type = 'mapping',
bg = som.events.colors,
keepMargins = F,
col = NA,
main = '')
plot(som.model1,
type = 'mapping',
bg = som.events.colors1,
keepMargins = F,
col = NA,
main = '')
Same thing but with lat-lon labels
plot(som.model,
type = 'mapping',
bg = som.events.colors,
keepMargins = F,
col = NA,
main = '')
dimnames(som.model$grid$pts) = list(paste0(as.character(round(as.data.frame(som.events)$lon,0)),
'~',
as.character(round(as.data.frame(som.events)$lat,0)))
, c("lon","lat"))
text(som.model$grid$pts, dimnames(som.model$grid$pts)[[1]])
plot(som.model1,
type = 'mapping',
bg = som.events.colors1,
keepMargins = F,
col = NA,
main = '')
dimnames(som.model1$grid$pts) = list(paste0(as.character(round(as.data.frame(som.events1)$lon,0)),
'~',
as.character(round(as.data.frame(som.events1)$lat,0)))
, c("lon","lat"))
text(som.model1$grid$pts, dimnames(som.model1$grid$pts)[[1]])
Start with CAMS low temperature changes so should be mostly blues
region_summary %>%
filter(esm == 'CAMS-CSM1-0') ->
region_summary_cams
region_numerical <- as.data.frame(region_summary_cams[c('lon', 'lat', 'r', "g", "b")])
set.seed(11)
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 172"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model.cams <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events.cams <- som.model.cams$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors.cams <- rgb( som.events.cams[,3],
som.events.cams[,4],
som.events.cams[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist.cams <- as.matrix(dist(som.events.cams))
p1<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events.cams), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors.cams) +
theme(legend.position = 'none') +
ggtitle('CAMS')
p1
plot(som.model.cams,
type = 'mapping',
bg = som.events.colors.cams,
keepMargins = F,
col = NA,
main = '')
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 172"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model1.cams <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events1.cams <- som.model1.cams$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors1.cams <- rgb( som.events1.cams[,3],
som.events1.cams[,4],
som.events1.cams[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist1.cams <- as.matrix(dist(som.events1.cams))
p2<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events1.cams), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors1.cams) +
theme(legend.position = 'none')+
ggtitle('CAMS')
p1
p2
ggsave(paste0('python_curation_figs/CAMS_som1_grid', grid.size, '.png'), p1, width = 8, height = 6, units = 'in')
ggsave(paste0('python_curation_figs/CAMS_som2_grid', grid.size, '.png'), p2, width = 8, height = 6, units = 'in')
plot(som.model.cams,
type = 'mapping',
bg = som.events.colors.cams,
keepMargins = F,
col = NA,
main = '')
plot(som.model1.cams,
type = 'mapping',
bg = som.events.colors1.cams,
keepMargins = F,
col = NA,
main = '')
region_summary %>%
filter(esm == 'CESM2-WACCM') ->
region_summary_cesm
region_numerical <- as.data.frame(region_summary_cesm[c('lon', 'lat', 'r', "g", "b")])
set.seed(11)
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 172"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model.cesm <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events.cesm <- som.model.cesm$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors.cesm <- rgb( som.events.cesm[,3],
som.events.cesm[,4],
som.events.cesm[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist.cesm <- as.matrix(dist(som.events.cesm))
p1<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events.cesm), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors.cesm) +
theme(legend.position = 'none') +
ggtitle('cesm')
p1
plot(som.model.cesm,
type = 'mapping',
bg = som.events.colors.cesm,
keepMargins = F,
col = NA,
main = '')
#### train the SOM ####
sample.size <- nrow(region_numerical)
print(paste('Num Observations is', sample.size ))
## [1] "Num Observations is 172"
## define a grid for the SOM and train
grid.size <- ceiling(sample.size ^ (1/2.5))
som.grid <- somgrid(xdim = grid.size, ydim = grid.size, topo = 'hexagonal', toroidal = F)
som.model1.cesm <- som(data.matrix(region_numerical), grid = som.grid)
# extract some values to be useful
som.events1.cesm <- som.model1.cesm$codes[[1]]
# assign a color to each event because we have 3 metrics
# red = iasd
# green = temperature change
# blue = precip change
som.events.colors1.cesm <- rgb( som.events1.cesm[,3],
som.events1.cesm[,4],
som.events1.cesm[,5],
maxColorValue = 255)
# calculate a distance matrix
som.dist1.cesm <- as.matrix(dist(som.events1.cesm))
p2<- ggplot() +
geom_sf(data = shp ) +
geom_text(data = shp, mapping = aes(label = Acronym, x = mean_lon, y= mean_lat), size =3) +
geom_point(data = as.data.frame(som.events1.cesm), mapping = aes(x = lon, y = lat, color = interaction(r,g,b)), alpha = 0.75, size = 3) +
scale_color_manual(values = som.events.colors1.cesm) +
theme(legend.position = 'none')+
ggtitle('cesm')
p1
p2
ggsave(paste0('python_curation_figs/cesm_som1_grid', grid.size, '.png'), p1, width = 8, height = 6, units = 'in')
ggsave(paste0('python_curation_figs/cesm_som2_grid', grid.size, '.png'), p2, width = 8, height = 6, units = 'in')
# dig in comparison
Some points are very similar like (46, 14) and (46,13), they are just different SOMs ordered
some are really not (like the mustard points (35, -24) vs (-59,5))
distill into other clusters and see how regions get assigned then plot map by ESM and SSP of those new clusters to see if consistent?
bright green/teal at (65, 59) actually kind of consistent between the two models
plot(som.model.cesm,
type = 'mapping',
bg = som.events.colors.cesm,
keepMargins = F,
col = NA,
main = '')
dimnames(som.model.cesm$grid$pts) = list(paste0(as.character(round(as.data.frame(som.events.cesm)$lon,0)),
'~',
as.character(round(as.data.frame(som.events.cesm)$lat,0)))
, c("lon","lat"))
text(som.model.cesm$grid$pts, dimnames(som.model.cesm$grid$pts)[[1]])
CAMS for comparison
So for each ESM, some of the events are consistent between estimated models and some aren’t
Maybe if we cluster the outputs, it’ll be distilled to the robust part. Then for each ESM-SSP, we can make a map with each region filled in with the coarse cluster value
#### look for a reasonable number of clusters ####
## Evaluate within cluster distances for different values of k. This is
## more dependent on the number of map units in the SOM than the structure
## of the underlying data, but until we have a better way...
## Define a function to calculate mean distance within each cluster. This
## is roughly analogous to the within clusters ss approach
try.k <- 2:60
clusterMeanDist <- function(clusters, som.dist){
cluster.means = c()
for(c in unique(clusters)){
temp.members <- which(clusters == c)
if(length(temp.members) > 1){
temp.dist <- som.dist[temp.members,]
temp.dist <- temp.dist[,temp.members]
cluster.means <- append(cluster.means, mean(temp.dist))
}else(cluster.means <- 0)
}
return(mean(cluster.means))
}
num_clusters <- function(events){
try.k <- 2:60
cluster.dist.eval <- as.data.frame(matrix(ncol = 3, nrow = (length(try.k))))
colnames(cluster.dist.eval) <- c('k', 'kmeans', 'hclust')
som.dist <- as.matrix(dist(events))
for(i in 1:length(try.k)) {
cluster.dist.eval[i, 'k'] <- try.k[i]
cluster.dist.eval[i, 'kmeans'] <- clusterMeanDist(kmeans(events, centers = try.k[i], iter.max = 20)$cluster,
som.dist=som.dist)
cluster.dist.eval[i, 'hclust'] <- clusterMeanDist(cutree(hclust(vegdist(events)), k = try.k[i]),
som.dist=som.dist)
}
return(cluster.dist.eval)
}
cesm1_cluster <- suppressWarnings(num_clusters(events=som.events.cesm))
plot(cesm1_cluster[, 'kmeans'] ~ try.k,
type = 'l')
lines(cesm1_cluster[, 'hclust'] ~ try.k,
col = 'red')
legend('topright',
legend = c('k-means', 'hierarchical'),
col = c('black', 'red'),
lty = c(1, 1))
cesm2_cluster <- suppressWarnings(num_clusters(events=som.events1.cesm))
plot(cesm2_cluster[, 'kmeans'] ~ try.k,
type = 'l')
lines(cesm2_cluster[, 'hclust'] ~ try.k,
col = 'red')
legend('topright',
legend = c('k-means', 'hierarchical'),
col = c('black', 'red'),
lty = c(1, 1))
print(cesm1_cluster[1:20,])
## k kmeans hclust
## 1 2 115.09368 116.88609
## 2 3 101.04137 87.69923
## 3 4 89.26251 70.29772
## 4 5 79.09651 66.54426
## 5 6 71.74314 66.85624
## 6 7 68.25380 36.65696
## 7 8 62.54271 41.18593
## 8 9 58.10822 54.27531
## 9 10 56.97680 54.27531
## 10 11 53.30057 50.90877
## 11 12 47.20858 46.31386
## 12 13 48.06856 50.17945
## 13 14 46.38431 50.17945
## 14 15 0.00000 45.87915
## 15 16 39.80492 45.87915
## 16 17 38.80640 45.87915
## 17 18 38.11367 45.04198
## 18 19 0.00000 45.04198
## 19 20 0.00000 41.68734
## 20 21 30.61355 38.38915
print(cesm2_cluster[1:20,])
## k kmeans hclust
## 1 2 114.78908 84.38540
## 2 3 99.46178 87.38900
## 3 4 84.44001 75.01906
## 4 5 79.21061 71.42349
## 5 6 72.70291 64.65223
## 6 7 66.30695 0.00000
## 7 8 59.32091 0.00000
## 8 9 54.84698 0.00000
## 9 10 55.00760 0.00000
## 10 11 52.19175 0.00000
## 11 12 49.97845 0.00000
## 12 13 47.00709 0.00000
## 13 14 45.16438 0.00000
## 14 15 43.52692 0.00000
## 15 16 40.21425 0.00000
## 16 17 39.61848 0.00000
## 17 18 37.02584 0.00000
## 18 19 35.83780 0.00000
## 19 20 0.00000 0.00000
## 20 21 18.33370 0.00000
6 clusters seems good for CESM
cams1_cluster <- suppressWarnings(num_clusters(events=som.events.cams))
plot(cams1_cluster[, 'kmeans'] ~ try.k,
type = 'l')
lines(cams1_cluster[, 'hclust'] ~ try.k,
col = 'red')
legend('topright',
legend = c('k-means', 'hierarchical'),
col = c('black', 'red'),
lty = c(1, 1))
cams2_cluster <- suppressWarnings(num_clusters(events=som.events1.cams))
plot(cams2_cluster[, 'kmeans'] ~ try.k,
type = 'l')
lines(cams2_cluster[, 'hclust'] ~ try.k,
col = 'red')
legend('topright',
legend = c('k-means', 'hierarchical'),
col = c('black', 'red'),
lty = c(1, 1))
print(cams1_cluster[1:20,])
## k kmeans hclust
## 1 2 96.63535 115.79397
## 2 3 85.10094 85.77567
## 3 4 65.03710 67.26459
## 4 5 57.65432 62.58544
## 5 6 52.54121 54.85245
## 6 7 48.61955 43.31569
## 7 8 46.93191 35.35015
## 8 9 43.23723 35.35015
## 9 10 41.50743 34.90566
## 10 11 40.21328 34.90566
## 11 12 35.36624 29.52258
## 12 13 35.32931 30.07854
## 13 14 33.88767 30.39190
## 14 15 31.68207 29.88608
## 15 16 31.38483 27.34743
## 16 17 29.87922 27.45862
## 17 18 29.09894 26.22720
## 18 19 0.00000 27.35123
## 19 20 21.80350 27.35123
## 20 21 24.94992 26.52608
print(cams2_cluster[1:20,])
## k kmeans hclust
## 1 2 97.88362 103.47237
## 2 3 84.62660 67.66945
## 3 4 65.41690 57.17970
## 4 5 61.09000 54.13661
## 5 6 53.88740 53.04385
## 6 7 50.31881 49.64488
## 7 8 47.12152 43.19777
## 8 9 44.88431 39.30754
## 9 10 42.72430 39.30754
## 10 11 39.28659 33.77423
## 11 12 37.44854 33.77423
## 12 13 35.88324 39.78227
## 13 14 34.36304 39.78227
## 14 15 32.11478 39.78227
## 15 16 30.58219 39.78227
## 16 17 0.00000 38.61507
## 17 18 27.91163 35.32840
## 18 19 27.26736 28.61386
## 19 20 26.24581 28.98457
## 20 21 0.00000 28.98457
7 clusters seems ok for cams - maybe 6 would be as well
#### evaluate clustering algorithms ####
## Having selected a reasonable value for k, evaluate different clustering algorithms.
## Define a function for make a simple plot of clustering output.
## This is the same as previousl plotting, but we define the function
## here as we wanted to play with the color earlier.
plotSOM <- function(som_model, som_colors, clusters){
plot(som_model,
type = 'mapping',
bg = som_colors,
keepMargins = F,
col = NA)
add.cluster.boundaries(som_model, clusters)
}
## Try several different clustering algorithms, and, if desired, different values for k
cluster.tries <- list()
for(k in c(6)){
## k-means clustering
som.cluster.k <- kmeans(som.events.cesm, centers = k, iter.max = 100, nstart = 10)$cluster # k-means
## hierarchical clustering
som.dist <- dist(som.events.cesm) # hierarchical, step 1
som.cluster.h <- cutree(hclust(som.dist), k = k) # hierarchical, step 2
## capture outputs
cluster.tries[[paste0('som.cluster.k.', k)]] <- som.cluster.k
cluster.tries[[paste0('som.cluster.h.', k)]] <- som.cluster.h
}
plotSOM(som_model = som.model.cesm, som_colors = som.events.colors.cesm, clusters = cluster.tries$som.cluster.k.6)
plotSOM(som_model = som.model.cesm, som_colors = som.events.colors.cesm, clusters = cluster.tries$som.cluster.h.6)
region_summary_cesm$model1_unitclass <- som.model.cesm$unit.classif
region_summary_cesm %>%
left_join(data.frame(model1_unitclass = 1:64, k_cluster1 = cluster.tries$som.cluster.k.6), by = 'model1_unitclass') %>%
left_join(data.frame(model1_unitclass = 1:64, h_cluster1 = cluster.tries$som.cluster.h.6), by = 'model1_unitclass')->
clustered
## Try several different clustering algorithms, and, if desired, different values for k
cluster.tries <- list()
for(k in c(6)){
## k-means clustering
som.cluster.k <- kmeans(som.events1.cesm, centers = k, iter.max = 100, nstart = 10)$cluster # k-means
## hierarchical clustering
som.dist <- dist(som.events1.cesm) # hierarchical, step 1
som.cluster.h <- cutree(hclust(som.dist), k = k) # hierarchical, step 2
## capture outputs
cluster.tries[[paste0('som.cluster.k.', k)]] <- som.cluster.k
cluster.tries[[paste0('som.cluster.h.', k)]] <- som.cluster.h
}
plotSOM(som_model = som.model1.cesm, som_colors = som.events.colors1.cesm, clusters = cluster.tries$som.cluster.k.6)
plotSOM(som_model = som.model1.cesm, som_colors = som.events.colors1.cesm, clusters = cluster.tries$som.cluster.h.6)
clustered$model2_unitclass <- som.model1.cesm$unit.classif
clustered %>%
left_join(data.frame(model2_unitclass = 1:64, k_cluster2 = cluster.tries$som.cluster.k.6), by = 'model2_unitclass') %>%
left_join(data.frame(model2_unitclass = 1:64, h_cluster2 = cluster.tries$som.cluster.h.6), by = 'model2_unitclass')->
clustered
shp %>%
left_join(clustered, by = c('Acronym' = 'region')) ->
shp1
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp126'), aes(fill = as.factor(k_cluster1)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp126'), aes(fill = as.factor(k_cluster2)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp126'), aes(fill = as.factor(h_cluster1)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp126'), aes(fill = as.factor(h_cluster2)) )
## CESM 245
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp245'), aes(fill = as.factor(k_cluster1)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp245'), aes(fill = as.factor(k_cluster2)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp245'), aes(fill = as.factor(h_cluster1)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp245'), aes(fill = as.factor(h_cluster2)) )
## CESM 370
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp370'), aes(fill = as.factor(k_cluster1)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp370'), aes(fill = as.factor(k_cluster2)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp370'), aes(fill = as.factor(h_cluster1)) )
ggplot() +
geom_sf(data = shp1 %>% filter( experiment == 'ssp370'), aes(fill = as.factor(h_cluster2)) )
ggplot() +
geom_sf(data = shp1 %>% na.omit, aes(fill = as.factor(k_cluster1)) ) +
facet_wrap(~experiment, nrow =2 )
ggplot() +
geom_sf(data = shp1 %>% na.omit, aes(fill = as.factor(k_cluster2)) ) +
facet_wrap(~experiment, nrow =2 )
## Clustering conclusions
Abstracts TOO much and is meaningless: all 6 clusters show up for each SSP, which means we aren’t extracting significant insight.
By ESM and SSP
Because we put the data all on the same scale initially, a blue in CAMs SOM is the same as the same blue in the CESM som.
The HARD part is getting the SOM-specific colors on the maps
region_summary_cesm$model1_unitclass <- som.model.cesm$unit.classif
region_summary_cesm$model2_unitclass <- som.model1.cesm$unit.classif
shp %>%
left_join(region_summary_cesm, by = c('Acronym' = 'region')) ->
shp_cesm
ggplot() +
geom_sf(data = shp_cesm %>% na.omit, aes(fill = as.factor(model1_unitclass)) ) +
scale_fill_manual(values = som.events.colors.cesm)+
facet_wrap(~experiment, nrow =2 ) +
ggtitle('CESM, model1')
ggplot() +
geom_sf(data = shp_cesm %>% na.omit, aes(fill = as.factor(model2_unitclass)) ) +
scale_fill_manual(values = som.events.colors1.cesm)+
facet_wrap(~experiment, nrow =2 ) +
ggtitle('CESM, model2')
# not evaluated
plotter <- function(events, colors){
data <- as.data.frame(events)
data$color <- colors
lons <- data.frame('lon' = sort(unique(data$lon))) %>% mutate(lon_order = as.integer(row.names(.)))
lats <- data.frame('lat' = sort(unique(data$lat))) %>% mutate(lat_order = as.integer(row.names(.)))
data %>%
left_join(lons, by = 'lon') %>%
left_join(lats, by = 'lat') %>%
arrange(lon, lat) %>%
mutate(plot_order = as.integer(row.names(.))) ->
data_plot
p <- ggplot(cesm1_plot) +
geom_point(mapping = aes(x = lon_order, y = 1, color = color), size = 3) +
facet_wrap(~plot_order, nrow = sqrt(nrow(data))) +
scale_color_manual(values = data_plot$color) +
theme_bw()+
theme(legend.position = 'none',
strip.background = element_blank(),
strip.text.x = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
return(p)
}
cesm_1<- as.data.frame(som.events.cesm)
cesm_1$color <- som.events.colors.cesm
lons <- data.frame('lon' = sort(unique(cesm_1$lon))) %>% mutate(lon_order = as.integer(row.names(.)))
lats <- data.frame('lat' = sort(unique(cesm_1$lat))) %>% mutate(lat_order = as.integer(row.names(.)))
cesm_1 %>%
left_join(lons, by = 'lon') %>%
left_join(lats, by = 'lat') %>%
arrange(lon, lat) %>%
mutate(plot_order = as.integer(row.names(.))) ->
cesm1_plot
p1<- ggplot(cesm1_plot) +
geom_point(mapping = aes(x = lon_order, y = lat_order, color = color), size = 3) +
facet_wrap(~lat_order, nrow = 8) +
#facet_grid(lon_order~lat_order) +
scale_color_manual(values = cesm1_plot$color) +
theme_bw()+
theme(legend.position = 'none',
strip.background = element_blank(),
strip.text.x = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
p <- plotter(events = som.events.cesm, colors = som.events.colors.cesm)
p
p1
cesm_2<- as.data.frame(som.events1.cesm)
cesm_2$color <- som.events.colors1.cesm
cams_1<- as.data.frame(som.events.cesm)
cams_1$color <- som.events.colors.cesm
cesm_2<- as.data.frame(som.events1.cesm)
cesm_2$color <- som.events.colors1.cesm
Haven’t figured out if the same ‘seafoam’ event in the CESM grid is actually the same underlying lat lon and it’s just ignoring lat lon for plotting. The maps I have plotted make me think no
ALSO the kohonen package only comes with toroidal and
non-toroidal grids. TO equate -180 and 180 lon, have to equate -90 and
90 lat which is for sure wrong